## [1] "LC_COLLATE=Chinese (Simplified)_China.936;LC_CTYPE=Chinese (Simplified)_China.936;LC_MONETARY=Chinese (Simplified)_China.936;LC_NUMERIC=C;LC_TIME=Chinese (Simplified)_China.936"
## [1] "C:/Users/sunwe/Desktop/EDA_SAMPLE/项目"
## 'data.frame': 328553 obs. of 21 variables:
## $ ListingId : chr "126541" "133291" "142421" "149711" ...
## $ 借款金额 : num 18000 9453 27000 25000 20000 ...
## $ 借款期限 : int 12 12 24 12 6 12 6 12 12 6 ...
## $ 借款利率 : num 18 20 20 18 16 14 18 18 16 18 ...
## $ 借款成功日期 : Date, format: "2015-05-04" "2015-03-16" ...
## $ 初始评级 : Factor w/ 6 levels "A","B","C","D",..: 3 4 5 3 3 1 5 2 2 3 ...
## $ 借款类型 : Factor w/ 4 levels "APP闪电","电商",..: 4 4 3 4 2 3 2 3 4 2 ...
## $ 是否首标 : Factor w/ 2 levels "否","是": 1 1 1 1 1 1 1 1 1 1 ...
## $ 年龄 : int 35 34 41 34 24 36 27 32 33 25 ...
## $ 性别 : Factor w/ 2 levels "男","女": 1 1 1 1 1 1 2 1 2 1 ...
## $ 手机认证 : Factor w/ 2 levels "成功认证","未成功认证": 1 2 1 1 1 1 1 1 1 1 ...
## $ 户口认证 : Factor w/ 2 levels "成功认证","未成功认证": 2 1 2 1 1 1 1 1 2 1 ...
## $ 视频认证 : Factor w/ 2 levels "成功认证","未成功认证": 1 2 2 1 1 1 1 1 1 1 ...
## $ 学历认证 : Factor w/ 2 levels "成功认证","未成功认证": 2 2 2 2 2 2 2 2 2 2 ...
## $ 征信认证 : Factor w/ 2 levels "成功认证","未成功认证": 2 2 2 2 2 2 2 2 2 2 ...
## $ 淘宝认证 : Factor w/ 2 levels "成功认证","未成功认证": 2 2 2 2 2 2 2 2 2 2 ...
## $ 历史成功借款次数: int 11 4 5 6 13 7 15 7 7 9 ...
## $ 历史成功借款金额: num 40326 14500 21894 36190 77945 ...
## $ 总待还本金 : num 8713 7891 11726 9703 0 ...
## $ 历史正常还款期数: int 57 13 25 41 118 56 75 52 41 49 ...
## $ 历史逾期还款期数: int 16 1 3 1 14 0 8 0 2 4 ...
## 'data.frame': 3203276 obs. of 10 variables:
## $ ListingId : chr "126541" "126541" "126541" "126541" ...
## $ 期数 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ 还款状态 : int 1 1 1 1 1 1 2 1 2 2 ...
## $ 应还本金 : num 1380 1401 1422 1443 1465 ...
## $ 应还利息 : num 270 249 228 207 185 ...
## $ 剩余本金 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ 剩余利息 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ 到期日期 : Date, format: "2015-06-04" "2015-07-04" ...
## $ 还款日期 : Date, format: "2015-06-04" "2015-07-04" ...
## $ recorddate: Date, format: "2017-02-22" "2017-02-22" ...
### Convert the six verification columns to 0/1 flags, count them per listing,
### and join the per-listing repayment totals onto the listing table.
# Flag column -> source verification column. The 'vedio' spelling is kept
# because it is the column name that ends up in df.lctrack.
auth_cols <- c(mobile = '手机认证', house = '户口认证', vedio = '视频认证',
               diploma = '学历认证', credit = '征信认证', taobao = '淘宝认证')
for (flag in names(auth_cols)) {
  # 1 when the value is '成功认证', 0 otherwise (NA stays NA).
  df.lc[[flag]] <- as.numeric(df.lc[[auth_cols[[flag]]]] == '成功认证')
}
# Number of verification flags set for each listing.
df.lc$all.auth <- Reduce(`+`, df.lc[names(auth_cols)])

# One row per ListingId: highest period number plus summed due and remaining
# principal / interest from the repayment table.
list_groups <- group_by(df.lp, ListingId)
df.lclp_by_list <- summarise(list_groups,
                             periods = max(期数),
                             repay_principle = sum(应还本金),
                             repay_interest = sum(应还利息),
                             residual_principle = sum(剩余本金),
                             residual_interest = sum(剩余利息))

# Inner join. The key is named explicitly so the join cannot silently start
# matching on any other column the two tables might come to share.
df.lctrack <- merge(df.lc, df.lclp_by_list, by = 'ListingId', all = FALSE)
str(df.lctrack)
## 'data.frame': 328553 obs. of 33 variables:
## $ ListingId : chr "10000021" "10000081" "10000101" "10000181" ...
## $ 借款金额 : num 2390 2000 7500 2093 3500 ...
## $ 借款期限 : int 6 12 12 12 12 12 12 12 6 12 ...
## $ 借款利率 : num 20 22 20 22 24 22 22 22 20 22 ...
## $ 借款成功日期 : Date, format: "2016-03-23" "2016-03-23" ...
## $ 初始评级 : Factor w/ 6 levels "A","B","C","D",..: 3 4 3 4 5 4 4 4 3 4 ...
## $ 借款类型 : Factor w/ 4 levels "APP闪电","电商",..: 3 4 3 3 3 1 4 3 1 4 ...
## $ 是否首标 : Factor w/ 2 levels "否","是": 1 1 2 1 2 2 1 1 1 1 ...
## $ 年龄 : int 23 30 27 26 32 27 43 23 20 37 ...
## $ 性别 : Factor w/ 2 levels "男","女": 1 1 2 2 2 2 1 1 1 1 ...
## $ 手机认证 : Factor w/ 2 levels "成功认证","未成功认证": 1 1 1 1 1 2 1 1 2 2 ...
## $ 户口认证 : Factor w/ 2 levels "成功认证","未成功认证": 2 2 2 2 2 2 2 2 2 2 ...
## $ 视频认证 : Factor w/ 2 levels "成功认证","未成功认证": 2 2 2 2 2 2 2 2 2 2 ...
## $ 学历认证 : Factor w/ 2 levels "成功认证","未成功认证": 2 2 1 2 1 2 2 2 2 1 ...
## $ 征信认证 : Factor w/ 2 levels "成功认证","未成功认证": 2 2 2 2 2 2 2 2 2 2 ...
## $ 淘宝认证 : Factor w/ 2 levels "成功认证","未成功认证": 2 2 2 2 2 2 2 2 2 2 ...
## $ 历史成功借款次数 : int 3 1 0 3 0 0 2 1 2 1 ...
## $ 历史成功借款金额 : num 12058 5000 0 13008 0 ...
## $ 总待还本金 : num 6610 3850 0 5907 0 ...
## $ 历史正常还款期数 : int 14 3 0 16 0 0 12 3 4 4 ...
## $ 历史逾期还款期数 : int 0 0 0 0 0 0 1 1 0 0 ...
## $ mobile : num 1 1 1 1 1 0 1 1 0 0 ...
## $ house : num 0 0 0 0 0 0 0 0 0 0 ...
## $ vedio : num 0 0 0 0 0 0 0 0 0 0 ...
## $ diploma : num 0 0 1 0 1 0 0 0 0 1 ...
## $ credit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ taobao : num 0 0 0 0 0 0 0 0 0 0 ...
## $ all.auth : num 1 1 2 1 2 0 1 1 0 1 ...
## $ periods : num 6 8 7 12 12 12 12 12 6 12 ...
## $ repay_principle : num 2390 2000 7500 2093 3500 ...
## $ repay_interest : num 141 198 633 258 471 ...
## $ residual_principle: num 0 0 0 1612 643 ...
## $ residual_interest : num 0 0 0 151.3 19.3 ...
第一部分:单变项分析
# Histogram of loan amount (借款金额) divided by 1000; the x-axis is limited to
# the range from 0 to the 99th percentile.
ggplot(df.lc, aes(x = 借款金额/1000)) +
  geom_histogram(fill = '#F79420') +
  xlim(0, quantile(df.lc$借款金额/1000, 0.99))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3278 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
# Min / quartiles / mean / max of the raw loan amount (no percentile cut-off).
summary(df.lc$借款金额)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100 2033 3397 4424 5230 500000
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 100 2033 3397 4424 5230 500000
单变项分析:1)超过50%的贷款在2000元-5000元的区间内,最低100元/单,最高50万元/单。
# Histogram of the loan term (借款期限, an integer column). binwidth = 1 gives
# one bar per integer term instead of the default 30 bins, and removes the
# "Pick better value with `binwidth`" message.
qplot(x = 借款期限, data = df.lc, binwidth = 1, fill = I('#FFFF00')) +
  scale_x_continuous(breaks = seq(0, 30, 1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Min / quartiles / mean / max of the loan term.
summary(df.lc$借款期限)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 12.00 10.21 12.00 24.00
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.00 6.00 12.00 10.21 12.00 24.00
单变项分析:2)贷款期数主要集中在6个月和12个月,其它期数的分布较少,最短1个月,最长24个月
# Histogram of the interest rate (借款利率), x-axis restricted to 10-25 with a
# tick at every integer.
ggplot(data = df.lc, mapping = aes(x = 借款利率)) +
  geom_histogram(fill = '#87CEFF') +
  scale_x_continuous(breaks = seq(10, 25, 1), limits = c(10, 25))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
# Min / quartiles / mean / max of the interest rate.
summary(df.lc$借款利率)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.5 20.0 20.0 20.6 22.0 24.0
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 6.5 20.0 20.0 20.6 22.0 24.0
单变项分析:3)75%的贷款利率在年化20%及以上(主要为18%、20%和22%),最低6.5%,最高24%。
# Histogram of listings over the loan success date (借款成功日期); the count
# axis is fixed to 0-32000 with a tick every 5000.
ggplot(df.lc, aes(x = 借款成功日期)) +
  geom_histogram(fill = '#FF0000') +
  scale_y_continuous(breaks = seq(0, 32000, 5000), limits = c(0, 32000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing missing values (geom_bar).
# Earliest / quartile / mean / latest loan success date.
summary(df.lc$借款成功日期)
## Min. 1st Qu. Median Mean 3rd Qu.
## "2015-01-01" "2016-05-08" "2016-09-11" "2016-08-05" "2016-12-01"
## Max.
## "2017-01-30"
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#"2015-01-01" "2016-05-08" "2016-09-11" "2016-08-05" "2016-12-01" "2017-01-30"
单变项分析:4)2015年1月1日至2017年1月30日,成功标量快速提高,最高3200单左右/天
# Per initial rating (初始评级): mean and median loan amount and listing count.
rank_groups <- group_by(df.lc, 初始评级)
df.lc_by_rank <- summarise(
  rank_groups,
  amount_mean = mean(借款金额),
  amount_median = median(借款金额),
  n = n()
)

# Listing count divided by 1000 for each rating, drawn as points with bars on top.
ggplot(df.lc_by_rank, aes(x = 初始评级, y = n/1000)) +
  geom_point() +
  geom_bar(stat = 'identity') +
  scale_y_continuous(breaks = seq(0, 150, 10), limits = c(0, 150))

df.lc_by_rank
## # A tibble: 6 x 4
## 初始评级 amount_mean amount_median n
## <fct> <dbl> <dbl> <int>
## 1 A 5889. 3100. 10284
## 2 B 7533. 3100 33188
## 3 C 4139. 3300 131705
## 4 D 3926. 3500 134860
## 5 E 3743. 4000 17027
## 6 F 3081. 3000 1489
单变项分析:5)初始评级为C和D的标占总体为81.13%。
# Per loan type (借款类型): mean and median loan amount and listing count.
category_groups <- group_by(df.lc, 借款类型)
df.lc_by_category <- summarise(
  category_groups,
  amount_mean = mean(借款金额),
  amount_median = median(借款金额),
  n = n()
)

# Listing count divided by 1000 for each loan type, points with bars on top.
ggplot(df.lc_by_category, aes(x = 借款类型, y = n/1000)) +
  geom_point() +
  geom_bar(stat = 'identity') +
  scale_y_continuous(breaks = seq(0, 150, 10), limits = c(0, 150))

df.lc_by_category
## # A tibble: 4 x 4
## 借款类型 amount_mean amount_median n
## <fct> <dbl> <dbl> <int>
## 1 APP闪电 2765. 1940 112079
## 2 电商 115368. 50000 1069
## 3 普通 5245. 4679 118103
## 4 其他 4119. 3329 97302
单变项分析:6)普通、APP闪电和其它贷款的占比为:35.95%、34.11%和29.61%。
## 否 是
## 241090 87463
单变项分析:7)非首标占比总体73.38%, 非首标是首标的2.76倍,即大多数情况下为老客户。
# Histogram of borrower age (年龄), x-axis restricted to 15-60.
ggplot(df.lc, aes(x = 年龄)) +
  geom_histogram(fill = '#2E8B57') +
  scale_x_continuous(breaks = seq(10, 60, 5), limits = c(15, 60))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
# Min / quartiles / mean / max of borrower age.
summary(df.lc$年龄)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 24.00 28.00 29.14 33.00 56.00
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 17.00 24.00 28.00 29.14 33.00 56.00
单变项分析:8)17-33岁的借款数占比总体75%,最大年龄借款为56岁。
# Listing count per gender (性别): bar chart followed by the level counts.
ggplot(df.lc, aes(x = 性别)) +
  geom_bar()
summary(df.lc$性别)
## 男 女
## 221946 106607
# 男 女
#221946 106607
单变项分析:9)标的男女比例为2:1
# One bar chart per verification column, all sharing the same count axis
# (0-350000, tick every 50000) so the six panels are directly comparable.
auth_count_scale <- scale_y_continuous(limits = c(0, 350000),
                                       breaks = seq(0, 350000, 50000))

lc_phone     <- ggplot(df.lc, aes(x = 手机认证)) + geom_bar() + auth_count_scale
lc_account   <- ggplot(df.lc, aes(x = 户口认证)) + geom_bar() + auth_count_scale
lc_vedio     <- ggplot(df.lc, aes(x = 视频认证)) + geom_bar() + auth_count_scale
lc_education <- ggplot(df.lc, aes(x = 学历认证)) + geom_bar() + auth_count_scale
lc_credit    <- ggplot(df.lc, aes(x = 征信认证)) + geom_bar() + auth_count_scale
lc_taobao    <- ggplot(df.lc, aes(x = 淘宝认证)) + geom_bar() + auth_count_scale

# Lay the six panels out in three columns.
grid.arrange(lc_phone, lc_account, lc_vedio,
             lc_education, lc_credit, lc_taobao,
             ncol = 3)
单变项分析:10)手机认证和学历认证的覆盖面明显高于其它认证渠道
# Histogram of the number of prior successful loans (历史成功借款次数).
# Only one x scale is kept: the original also added
# xlim(0, quantile(..., 0.99)), which ggplot2 replaced with the later
# scale_x_continuous() (emitting "Scale for 'x' is already present"), so
# dropping it leaves the plot unchanged and removes the message.
qplot(x = 历史成功借款次数, data = df.lc) +
  scale_x_continuous(limits = c(-1, 13), breaks = seq(-1, 13, 1))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2041 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
# Min / quartiles / mean / max of the prior successful loan count.
summary(df.lc$历史成功借款次数)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 2.000 2.323 3.000 649.000
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000 0.000 2.000 2.323 3.000 649.000
单变项分析:11)75%的标的顾客有至少1次成功借款经历。
# Histogram of the prior successful loan amount (历史成功借款金额) divided by
# 1000; x limited to 0..99th percentile, count axis fixed to 0-35000.
ggplot(df.lc, aes(x = 历史成功借款金额/1000)) +
  geom_histogram() +
  xlim(0, quantile(df.lc$历史成功借款金额/1000, 0.99)) +
  scale_y_continuous(breaks = seq(0, 35000, 5000), limits = c(0, 35000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3286 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).
# Summary restricted to listings whose prior successful loan amount is > 0.
summary(subset(df.lc$历史成功借款金额,df.lc$历史成功借款金额 >0))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100 4000 7473 11973 13082 7405926
#all_loans
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#0 0 5000 8786 10355 7405926
#loans whose historical amount is greater than 0
#Min. 1st Qu. Median Mean 3rd Qu. Max.
#100 4000 7473 11973 13082 7405926
单变项分析:12)凡有历史成功借款金额平均金额为11,973元,中位数(金额)为7473元
# Label every listing by whether its summed remaining principal is positive.
# NOTE(review): residual_principle > 0 is labelled '已逾期' (overdue). If
# 剩余本金 also covers instalments that were not yet due at recorddate, this
# over-counts overdue loans -- confirm the intended definition.
df.lctrack$status <- ifelse(df.lctrack$residual_principle > 0,
                            '已逾期', '正常还款')

# Listing count per status.
lctrack_status <- ggplot(aes(x = status), data = df.lctrack) +
  geom_bar() +
  scale_y_continuous(limits = c(0, 250000), breaks = seq(0, 250000, 50000))
lctrack_status

# Count and share per status. The denominator is taken from the data instead
# of the hard-coded 328553, so the share stays correct if the row count changes.
status_groups <- group_by(df.lctrack, status)
df.lc_by_status <- summarise(status_groups,
                             n = n(),
                             percent_n = n() / nrow(df.lctrack))
df.lc_by_status
## # A tibble: 2 x 3
## status n percent_n
## <chr> <int> <dbl>
## 1 已逾期 233245 0.710
## 2 正常还款 95308 0.290
单变项分析:13)期间在所有的贷款中,71%存在不同程度的逾期,29%正常还款。
将拍拍贷的LC和LP(按照ListingID汇总)进行合并,基本结构如下: ‘data.frame’: 328553 obs. of 26 variables: $ ListingId : chr “10000021” “10000081” “10000101” “10000181” … $ 借款金额 : num 2390 2000 7500 2093 3500 … $ 借款期限 : int 6 12 12 12 12 12 12 12 6 12 … $ 借款利率 : num 20 22 20 22 24 22 22 22 20 22 … $ 借款成功日期 : Date, format: “2016-03-23” “2016-03-23” “2016-03-26” … $ 初始评级 : Factor w/ 6 levels “A”,“B”,“C”,“D”,..: 3 4 3 4 5 4 4 4 3 4 … $ 借款类型 : Factor w/ 4 levels “APP闪电”,“电商”,..: 3 4 3 3 3 1 4 3 1 4 … $ 是否首标 : Factor w/ 2 levels “否”,“是”: 1 1 2 1 2 2 1 1 1 1 … $ 年龄 : int 23 30 27 26 32 27 43 23 20 37 … $ 性别 : Factor w/ 2 levels “男”,“女”: 1 1 2 2 2 2 1 1 1 1 … $ 手机认证 : Factor w/ 2 levels “成功认证”,“未成功认证”: 1 1 1 1 1 2 1 1 2 2 … $ 户口认证 : Factor w/ 2 levels “成功认证”,“未成功认证”: 2 2 2 2 2 2 2 2 2 2 … $ 视频认证 : Factor w/ 2 levels “成功认证”,“未成功认证”: 2 2 2 2 2 2 2 2 2 2 … $ 学历认证 : Factor w/ 2 levels “成功认证”,“未成功认证”: 2 2 1 2 1 2 2 2 2 1 … $ 征信认证 : Factor w/ 2 levels “成功认证”,“未成功认证”: 2 2 2 2 2 2 2 2 2 2 … $ 淘宝认证 : Factor w/ 2 levels “成功认证”,“未成功认证”: 2 2 2 2 2 2 2 2 2 2 … $ 历史成功借款次数 : int 3 1 0 3 0 0 2 1 2 1 … $ 历史成功借款金额 : num 12058 5000 0 13008 0 … $ 总待还本金 : num 6610 3850 0 5907 0 … $ 历史正常还款期数 : int 14 3 0 16 0 0 12 3 4 4 … $ 历史逾期还款期数 : int 0 0 0 0 0 0 1 1 0 0 … $ periods : num 6 8 7 12 12 12 12 12 6 12 … $ repay_principle : num 2390 2000 7500 2093 3500 … $ repay_interest : num 141 198 633 258 471 … $ residual_principle: num 0 0 0 1612 643 … $ residual_interest : num 0 0 0 151.3 19.3 …
1.拍拍贷的借款人特征:年龄、性别、单笔借款金额、借款期限、认证方式、历史借款情况 2.贷款业务发展趋势:贷款笔数、逾期情况(逾期金额/借款金额)
贷款的初始评级,即个人的初始信用评级
创建了如下变项: 1.通过认证了的渠道数(all.auth) 2.status:是否在该数据时间段内出现还款延期的情况
存在异常分布的3个变项为:借款金额、历史成功借款次数和历史成功借款金额;在制作图表的时候,按照分布在99%范围的数据进行显示,避免对大多数情况下的展示有影响;但原始数据不做调整
单变项发现的基本特征: 1)超过50%的贷款在2000元-5000元的区间内,最低100元/单,最高50万元/单 2)贷款期数主要集中在6个月和12个月,其它期数的分布较少,最短1个月,最长24个月 3)75%的贷款利率在年化20%及以上(主要为18%、20%和22%),最低6.5%,最高24%。 4)2015年1月1日至2017年1月30日,成功标量快速提高,从起始的单日100单以下到最高3200单左右/天 5)初始评级为C和D的标占总体为81.13% 6)普通、APP闪电和其它贷款的占比为:35.95%、34.11%和29.61% 7)非首标占比总体73.38%, 非首标是首标的2.76倍,即大多数情况下为老客户 8)17-33岁的借款数占比总体75%,最大年龄借款为56岁 9)标的男女比例为2:1 10)手机认证和学历认证的覆盖面明显高于其它认证渠道,与大学生群体可能有交集 11)75%的标的顾客有至少1次成功借款经历 12)凡有历史成功借款金额平均金额为11,973元,中位数(金额)为7,473元 13)期间在所有的贷款中,71%存在不同程度的逾期,29%正常还款。
# Bucket age into (16,26], (26,36], (36,46], (46,56] and summarise the loan
# amount (mean, median) and listing count per bucket.
df.lc$age.bucket <- cut(df.lc$年龄, breaks = c(16, 26, 36, 46, 56))
age_groups <- group_by(df.lc, age.bucket)
df.lc_by_age <- summarise(
  age_groups,
  amount_mean = mean(借款金额),
  amount_median = median(借款金额),
  n = n()
)
df.lc_by_age
## # A tibble: 4 x 4
## age.bucket amount_mean amount_median n
## <fct> <dbl> <dbl> <int>
## 1 (16,26] 3469. 3000 134911
## 2 (26,36] 4928. 3777 148736
## 3 (36,46] 5613. 4178 38109
## 4 (46,56] 5661. 4500 6797
# Loan amount distribution per age bucket; y-axis limited to 0-10000.
ggplot(df.lc, aes(x = age.bucket, y = 借款金额)) +
  geom_boxplot(fill = '#F79420') +
  scale_y_continuous(breaks = seq(0, 10000, 1000), limits = c(0, 10000))
## Warning: Removed 9251 rows containing non-finite values (stat_boxplot).
# Spearman rank correlation between age and loan amount.
cor.test(df.lc$年龄, df.lc$借款金额, method = 'spearman')
## Warning in cor.test.default(df.lc$年龄, df.lc$借款金额, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df.lc$年龄 and df.lc$借款金额
## S = 4.3088e+15, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2710632
双变项分析:1)按照年龄组的增长,平均借款金额逐渐提高
# Loan amount distribution split by the first-listing flag (是否首标);
# y-axis limited to 0-6000.
ggplot(df.lc, aes(x = 是否首标, y = 借款金额)) +
  geom_boxplot(fill = '#F79420') +
  scale_y_continuous(breaks = seq(0, 6000, 1000), limits = c(0, 6000))
## Warning: Removed 54911 rows containing non-finite values (stat_boxplot).
双变项分析:2)首标的平均借款金额高于非首标的借款金额
# Print the per-rating amount summary again for reference.
df.lc_by_rank
## # A tibble: 6 x 4
## 初始评级 amount_mean amount_median n
## <fct> <dbl> <dbl> <int>
## 1 A 5889. 3100. 10284
## 2 B 7533. 3100 33188
## 3 C 4139. 3300 131705
## 4 D 3926. 3500 134860
## 5 E 3743. 4000 17027
## 6 F 3081. 3000 1489
# Loan amount distribution per initial rating; y-axis limited to 0-10000.
ggplot(df.lc, aes(x = 初始评级, y = 借款金额)) +
  geom_boxplot(fill = '#FF0000') +
  scale_y_continuous(breaks = seq(0, 10000, 1000), limits = c(0, 10000))
## Warning: Removed 9251 rows containing non-finite values (stat_boxplot).
双变项分析:3)随着初始评级的降低(A-E),借款金额逐步升高,F级借款金额下降(根据中位数判断)。
# Interest rate distribution per initial rating; the y-axis is limited to the
# 2.5th-97.5th percentile range of the interest rate.
ggplot(df.lc, aes(x = 初始评级, y = 借款利率)) +
  geom_boxplot() +
  ylim(quantile(df.lc$借款利率, c(0.025, 0.975)))
## Warning: Removed 1502 rows containing non-finite values (stat_boxplot).
双变项分析:4)随着初始评级的降低(A-E),借款利率随之升高,而在F级借款利率下降
# Loan amount distribution per loan type; y limited to 0..99th percentile.
ggplot(df.lc, aes(x = 借款类型, y = 借款金额)) +
  geom_boxplot() +
  ylim(0, quantile(df.lc$借款金额, 0.99))
## Warning: Removed 3278 rows containing non-finite values (stat_boxplot).
双变项分析:5)借款金额由小到大,一般分布为APP闪电、其他、普通和电商
# Keep only listings with a positive prior successful loan amount, then plot
# the current amount against the prior amount (both axes cut at their 99th
# percentile) with a smoothed trend line.
df.lc_new <- subset(df.lc, 历史成功借款金额 > 0)
ggplot(df.lc_new, aes(x = 历史成功借款金额, y = 借款金额)) +
  geom_point(alpha = 1/100) +
  xlim(0, quantile(df.lc_new$历史成功借款金额, 0.99)) +
  ylim(0, quantile(df.lc_new$借款金额, 0.99)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3963 rows containing non-finite values (stat_smooth).
## Warning: Removed 3963 rows containing missing values (geom_point).
# Pearson correlation between prior successful loan amount and current loan
# amount, on the subset with a positive prior amount.
cor.test(df.lc_new$历史成功借款金额, df.lc_new$借款金额, method = 'pearson')
##
## Pearson's product-moment correlation
##
## data: df.lc_new$历史成功借款金额 and df.lc_new$借款金额
## t = 399.45, df = 241090, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6286641 0.6334682
## sample estimates:
## cor
## 0.6310722
双变项分析:6)在存在历史借款金额的情况下,历史成功借款金额和借款金额的相关性呈现正相关关系,相关系数:0.631
# Loan amount against interest rate for all listings, with a smoothed trend.
ggplot(data = df.lc, mapping = aes(x = 借款利率, y = 借款金额)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Spearman rank correlation between interest rate and loan amount. The
# original call tested 借款期限 (loan term) here, although the plot before it
# and the conclusion after it are both about 借款利率 (interest rate).
# NOTE(review): the recorded output that follows (rho = 0.097) was produced
# with 借款期限 and must be regenerated by re-knitting the document.
cor.test(df.lc$借款利率, df.lc$借款金额, method = 'spearman')
## Warning in cor.test.default(df.lc$借款期限, df.lc$借款金额, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df.lc$借款期限 and df.lc$借款金额
## S = 5.3367e+15, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.09716509
双变项分析:7)借款金额随着借款利率的升高而下降
# Loan amount / 1000 against the number of passed verifications (all.auth),
# jittered, y limited to 0..99th percentile, with a smoothed trend.
# all.auth only takes the values 0-6, so the default GAM basis (10 knots)
# cannot be fitted ("x has insufficient unique values to support 10 knots")
# and no trend line was drawn. The basis dimension is reduced to k = 5.
ggplot(aes(x = all.auth, y = 借款金额/1000), data = df.lc) +
  geom_jitter(alpha = 1/100) +
  ylim(0, quantile(df.lc$借款金额/1000, 0.99)) +
  scale_x_continuous(breaks = seq(0, 6, 1)) +
  geom_smooth(method = 'gam', formula = y ~ s(x, bs = 'cs', k = 5))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3278 rows containing non-finite values (stat_smooth).
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Warning: Removed 3297 rows containing missing values (geom_point).
# Spearman rank correlation between verification count and loan amount.
cor.test(df.lc$all.auth, df.lc$借款金额, method = 'spearman')
## Warning in cor.test.default(df.lc$all.auth, df.lc$借款金额, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df.lc$all.auth and df.lc$借款金额
## S = 4.5439e+15, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2312849
# Per verification count: mean and median loan amount and listing count.
auth_groups <- group_by(df.lc, all.auth)
df.lc_by_auth <- summarise(
  auth_groups,
  amount_mean = mean(借款金额),
  amount_median = median(借款金额),
  n = n()
)
df.lc_by_auth
## # A tibble: 7 x 4
## all.auth amount_mean amount_median n
## <dbl> <dbl> <dbl> <int>
## 1 0 3337. 2928. 131776
## 2 1 4427. 3824 133598
## 3 2 5002. 3748 50622
## 4 3 10248. 3950 9143
## 5 4 21490. 4300 2859
## 6 5 25055. 4523 542
## 7 6 27436 12600 13
双变项分析:7)在所有的验证方式总数的基础上,随着验证数的增加,借款金额增加
# Recompute the status label from the remaining principal.
df.lctrack$status <-ifelse(df.lctrack$residual_principle>0,'已逾期','正常还款')
# Subset of listings labelled '已逾期'.
# NOTE(review): df_lctrack_new is not used by the plot below, which draws from
# the full df.lctrack -- confirm whether the boxplot was meant to use the subset.
df_lctrack_new <- subset(df.lctrack,df.lctrack$status =='已逾期')
# Remaining principal / 1000 per initial rating; y limited to 0..95th percentile.
ggplot(aes(x= 初始评级 , y= residual_principle/1000), data= df.lctrack) +
geom_boxplot()+
ylim(0, quantile(df.lctrack$residual_principle/1000, 0.95))
## Warning: Removed 16428 rows containing non-finite values (stat_boxplot).
# Per initial rating on the merged table: remaining principal as a share of
# the amount lent, mean remaining principal, and listing count.
# This overwrites the earlier df.lc_by_rank / rank_groups objects.
rank_groups <- group_by(df.lctrack, 初始评级)
df.lc_by_rank <- summarise(
  rank_groups,
  overdue_amount_ratio = sum(residual_principle) / sum(借款金额),
  overdue_amount_mean = mean(residual_principle),
  n = n()
)
df.lc_by_rank
## # A tibble: 6 x 4
## 初始评级 overdue_amount_ratio overdue_amount_mean n
## <fct> <dbl> <dbl> <int>
## 1 A 0.310 1827. 10284
## 2 B 0.245 1844. 33188
## 3 C 0.420 1740. 131705
## 4 D 0.487 1913. 134860
## 5 E 0.352 1318. 17027
## 6 F 0.425 1308. 1489
双变项分析:8)随着初始评级的下降,逾期本金金额和逾期金额占比有一定提高 注:逾期本金金额占比 = 逾期金额/借款金额
# Frequency polygon of remaining principal / 1000, one line per initial
# rating, x limited to 0-10.
# NOTE(review): y is the binned listing count divided by the summed count, so
# it is a share of listings; the y label reads as a share of amount -- confirm
# which is intended.
qplot(x= residual_principle/1000,
y= ..count../sum(..count..),
xlab = '逾期金额(单位:千)',
ylab = '逾期金额占比',
data = df.lctrack,
geom = 'freqpoly', color = 初始评级) +
scale_x_continuous(limits = c(0,10), breaks = seq(0,10,1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2348 rows containing non-finite values (stat_bin).
## Warning: Removed 12 rows containing missing values (geom_path).
双变项分析:9)初始评级为C和D的借款标的逾期情况明显高于其它评级,D略高于C
# Remaining principal as a share of the loan amount, per listing, against the
# loan success date; y limited to 0-1.
ggplot(df.lctrack, aes(x = 借款成功日期, y = residual_principle/借款金额)) +
  geom_point(alpha = 1/100) +
  scale_y_continuous(breaks = seq(0, 1, 0.1), limits = c(0, 1))
双变项分析:10)2016年1月以后发行的标,逾期金额占比普遍明显增多
# Per loan success date: remaining principal as a share of the amount lent,
# median loan amount, and listing count.
date_groups <- group_by(df.lctrack, 借款成功日期)
df.lctrack_by_date <- summarise(
  date_groups,
  residual_ratio = sum(residual_principle) / sum(借款金额),
  amount_median = median(借款金额),
  n = n()
)
df.lctrack_by_date
## # A tibble: 756 x 4
## 借款成功日期 residual_ratio amount_median n
## <date> <dbl> <dbl> <int>
## 1 2015-01-01 0.0742 5106. 6
## 2 2015-01-02 0.0359 4692 6
## 3 2015-01-03 0 5192 4
## 4 2015-01-04 0.00595 5298. 30
## 5 2015-01-05 0.00521 3500 29
## 6 2015-01-06 0.0167 3070 13
## 7 2015-01-07 0.00572 5750 38
## 8 2015-01-08 0 3695 16
## 9 2015-01-09 0 4200 23
## 10 2015-01-10 0.0182 6020 5
## # ... with 746 more rows
# Daily residual_ratio over the loan success date with a smoothed trend.
# The y scale is now part of the `+` chain: previously geom_smooth() ended the
# expression, so scale_y_continuous() was evaluated as a separate statement,
# never applied to the plot, and printed as a bare scale object.
ggplot(aes(x = 借款成功日期, y = residual_ratio), data = df.lctrack_by_date) +
  geom_point() +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
双变项分析:11)2016年1月之后成功借款的标的,逾期金额占比借款金额快速升高。
对借款金额和逾期金额感兴趣;借款金额是针对客户服务的潜在成本,逾期金额是可能不能收回的成本
1)按照年龄组的增长,平均借款金额逐渐提高 2)首标的平均借款金额高于非首标的借款金额 3)借款金额随着初始评级的降低(A-E),借款金额逐步升高,F级借款金额下降(根据中位数判断) 4)随着初始评级的降低(A-E),借款利率随之升高,而在F级借款利率下降 5)借款金额由小到大,一般分布为APP闪电、其他、普通和电商 6)在存在历史借款金额的情况下,历史成功借款金额和借款金额的相关性呈现正相关关系,相关系数:0.631 7)在所有的验证方式总数的基础上,随着验证数的增加,借款金额增加 8)随着初始评级的下降,逾期本金金额和逾期金额占比有一定提高 9)初始评级为C和D的借款标的逾期情况明显高于其它评级,D略高于C 10)2016年1月以后发行的标,逾期金额占比普遍明显增多 11)2016年1月之后成功借款的标的,逾期金额占比借款金额(逾期率)快速升高。
最强的关系是:历史成功借款金额和借款金额的Pearson相关系数,0.631
# Loan amount / 1000 over the loan success date, coloured by initial rating,
# with one smoothed trend per rating; y limited to 0-10.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
ggplot(aes(x = 借款成功日期, y = 借款金额/1000, color = 初始评级),
       data = df.lctrack) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 1)) +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = '初始评级', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 9251 rows containing non-finite values (stat_smooth).
## Warning: Removed 10412 rows containing missing values (geom_point).
多变项分析:1)初始评级与借款金额基本没有特定关系
# Remaining principal / loan amount over the loan success date, coloured by
# initial rating, with one smoothed trend per rating; y limited to 0-1.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
ggplot(aes(x = 借款成功日期, y = residual_principle/借款金额, color = 初始评级),
       data = df.lctrack) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = '初始评级', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 47657 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_smooth).
多变项分析:2)在2016年1月前,各初始评级的逾期金额占比分布相对正常,之后初始评级失效
# Loan amount / 1000 over the loan success date, coloured by the first-listing
# flag, with one smoothed trend per group; y limited to 0-10.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
ggplot(aes(x = 借款成功日期, y = 借款金额/1000, color = 是否首标),
       data = df.lctrack) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 1)) +
  scale_color_brewer(palette = "Set1",
                     guide = guide_legend(title = '是否首标', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 9251 rows containing non-finite values (stat_smooth).
## Warning: Removed 10356 rows containing missing values (geom_point).
多变项分析:3)在2015-01到2017-01发行的标的,首标的平均借款金额逐渐超过非首标的平均借款金额
# Remaining principal / loan amount over the loan success date, coloured by
# the first-listing flag, with one smoothed trend per group; y limited to 0-1.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
ggplot(aes(x = 借款成功日期, y = residual_principle/借款金额, color = 是否首标),
       data = df.lctrack) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
  scale_color_brewer(palette = "Set1",
                     guide = guide_legend(title = '是否首标', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 47506 rows containing missing values (geom_point).
多变项分析:4)在2015-01到2017-01发行的标的,首标的逾期金额占比始终高于非首标的逾期金额占比
# Treat the verification count as a category so each count gets its own colour.
df.lctrack$all.auth <- factor(df.lctrack$all.auth)

# Loan amount / 1000 over the loan success date, coloured by verification
# count; y limited to 0-10.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
# NOTE(review): the recorded run reports "Computation failed in stat_smooth():
# x has insufficient unique values to support 10 knots", so no trend lines are
# drawn here. Presumably the smallest all.auth group has too few rows left
# after the y limit -- confirm and pick a smoother that fits every group.
ggplot(aes(x = 借款成功日期, y = 借款金额/1000, color = all.auth),
       data = df.lctrack) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 1)) +
  scale_color_brewer(palette = "Set1",
                     guide = guide_legend(title = '认证成功数', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 9251 rows containing non-finite values (stat_smooth).
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Warning: Removed 10352 rows containing missing values (geom_point).
多变项分析:5)在2016-01之后,未进行认证的顾客显著增加
# Remaining principal / loan amount over the loan success date, coloured by
# verification count, with one smoothed trend per count; y limited to 0-1.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
ggplot(aes(x = 借款成功日期, y = residual_principle/借款金额, color = all.auth),
       data = df.lctrack) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = '认证成功数', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 47753 rows containing missing values (geom_point).
## Warning: Removed 19 rows containing missing values (geom_smooth).
多变项分析:6)在2016-01之后,各个认证成功数的标的均出现逾期金额占比高企的情况
# Loan amount / 1000 over the loan success date, coloured by loan type, with
# one smoothed trend per type; y limited to 0-10 with a tick every 0.5.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
ggplot(aes(x = 借款成功日期, y = 借款金额/1000, color = 借款类型),
       data = df.lctrack) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 0.5)) +
  scale_color_brewer(palette = "Set1",
                     guide = guide_legend(title = '借款类型', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 9251 rows containing non-finite values (stat_smooth).
## Warning: Removed 10297 rows containing missing values (geom_point).
多变项分析:7)电商借款在2016年末不再有新的标的,而其它类型(APP闪电、普通和其他)在2016年1月后有不同程度的借款金额下降。
# Remaining principal / loan amount over the loan success date, coloured by
# loan type, with one smoothed trend per type; y limited to 0-1.
# The legend title now names the colour variable (借款类型); it previously read
# '是否首标', copied from the first-listing plot.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
ggplot(aes(x = 借款成功日期, y = residual_principle/借款金额, color = 借款类型),
       data = df.lctrack) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
  scale_color_brewer(palette = "Set1",
                     guide = guide_legend(title = '借款类型', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 47447 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_smooth).
多变项分析:8)在2016-01之前,逾期金额占比:其他>普通>电商,2016-01之后变化剧烈
#options(scipen= 200)
# Listings with a positive prior successful loan amount, so log10(x) is finite.
df.lctrack_new <- subset(df.lctrack, 历史成功借款金额 > 0)

# Loan amount against prior successful loan amount on log-log axes, coloured
# by initial rating, with one smoothed trend per rating.
# The colour scale is now part of the `+` chain: previously geom_smooth() ended
# the expression, so scale_color_brewer() was never applied and was printed as
# a bare ggproto object. The legend title also now names the colour variable
# (初始评级) instead of '性别'. `reverse = FALSE` replaces `F`.
ggplot(aes(x = 历史成功借款金额, y = 借款金额, color = 初始评级),
       data = df.lctrack_new) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth() +
  scale_color_brewer(palette = "Set1",
                     guide = guide_legend(title = '初始评级', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
多变项分析:9)在历史借款金额越高,借款金额越高的基础上,B级的跨度区间最大,C级和D级标的金额和历史借款金额高度集中
# Remaining principal / loan amount against prior successful loan amount
# (log10 x), coloured by initial rating, one smoothed trend per rating;
# y limited to 0-1.
# The legend title now names the colour variable (初始评级) instead of '性别'.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
ggplot(aes(x = 历史成功借款金额, y = residual_principle/借款金额, color = 初始评级),
       data = df.lctrack_new) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  scale_x_log10() +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
  scale_color_brewer(palette = "Set1",
                     guide = guide_legend(title = '初始评级', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 33137 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_smooth).
多变项分析:10)A-E初始评级的逾期率均是先上升后下降
# Loan amount against prior successful loan amount on log-log axes, coloured
# by loan type, with one smoothed trend per type.
# The colour scale is now part of the `+` chain: previously geom_smooth() ended
# the expression, so scale_color_brewer() was never applied and was printed as
# a bare ggproto object. The legend title also now names the colour variable
# (借款类型) instead of '性别'. `reverse = FALSE` replaces `F`.
ggplot(aes(x = 历史成功借款金额, y = 借款金额, color = 借款类型),
       data = df.lctrack_new) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth() +
  scale_color_brewer(palette = "Set1",
                     guide = guide_legend(title = '借款类型', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
多变项分析:11)APP闪电贷和普通贷款分布最多,且金额集中
# Remaining principal / loan amount against prior successful loan amount
# (log10 x), coloured by loan type, one smoothed trend per type; y limited
# to 0-1.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
ggplot(aes(x = 历史成功借款金额, y = residual_principle/借款金额, color = 借款类型),
       data = df.lctrack_new) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  scale_x_log10() +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
  scale_color_brewer(palette = "Set1",
                     guide = guide_legend(title = '借款类型', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 33147 rows containing missing values (geom_point).
## Warning: Removed 18 rows containing missing values (geom_smooth).
多变项分析:12)APP闪电、普通和其他贷款逾期金额占比,随着历史成功借款金额增多而增多;而电商贷款是随着历史成功借款金额的增多而减少。
# Same age buckets as used on df.lc, added to the merged table.
df.lctrack$age.bucket <- cut(df.lctrack$年龄, c(16, 26, 36, 46, 56))

# Loan amount against prior successful loan amount on log-log axes, coloured
# by age bucket, with one smoothed trend per bucket.
# Rows with a prior amount of 0 are filtered out before plotting: log10(0) is
# -Inf, which triggered "Transformation introduced infinite values" and left
# those rows out of the smoother anyway.
# `reverse = FALSE` replaces `F`, which is an ordinary reassignable variable.
ggplot(aes(x = 历史成功借款金额, y = 借款金额, color = age.bucket),
       data = subset(df.lctrack, 历史成功借款金额 > 0)) +
  geom_point(alpha = 1/20, size = 1, position = 'jitter') +
  geom_smooth() +
  scale_x_log10() +
  scale_y_log10() +
  scale_color_brewer(palette = "Set1",
                     guide = guide_legend(title = '年龄分组', reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2)))
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 87463 rows containing non-finite values (stat_smooth).
多变项分析:13)(16,26]和(26,36]两个年龄层次是2015-01至2017-01时间范围内标的的主体,其中16-26为大学生群体
# Scatter of residual_principle / 借款金额 against 历史成功借款金额 (log10 x-axis),
# coloured by age.bucket, with a smoother that inherits the colour mapping.
# Points are jittered and drawn at alpha 0.05; legend keys are overridden to be
# opaque and larger. The y-axis is restricted to [0, 1]; rows outside that
# range are dropped by ggplot2 with a warning.
ggplot(
  data = df.lctrack,
  mapping = aes(
    x = 历史成功借款金额,
    y = residual_principle / 借款金额,
    colour = age.bucket
  )
) +
  geom_point(alpha = 1 / 20, size = 1, position = "jitter") +
  geom_smooth() +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
  scale_color_brewer(
    palette = "Set1",
    guide = guide_legend(
      # The colour aesthetic is age.bucket, so the legend is titled
      # "年龄分组" (age group) rather than "性别" (gender).
      title = "年龄分组",
      reverse = FALSE,
      override.aes = list(alpha = 1, size = 2)
    )
  )
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 87463 rows containing non-finite values (stat_smooth).
## Warning: Removed 47445 rows containing missing values (geom_point).
## Warning: Removed 22 rows containing missing values (geom_smooth).
多变项分析:14)各年龄层次在2015-01至2017-01时间范围内,均随着历史成功借款金额增大,逾期率下降;历史借款金额低于1000元情况下,(16,26]年龄组逾期率最低
# 借款金额 (log10 y-axis) against 借款利率, coloured by 是否首标, with a smoother
# that inherits the colour mapping. Points are jittered and drawn at alpha
# 0.05; legend keys are overridden to be opaque and larger.
ggplot(
  data = df.lctrack,
  mapping = aes(x = 借款利率, y = 借款金额, colour = 是否首标)
) +
  geom_point(position = "jitter", size = 1, alpha = 0.05) +
  geom_smooth() +
  scale_y_log10() +
  scale_color_brewer(
    type = "div",
    guide = guide_legend(
      title = "是否首标",
      reverse = FALSE,
      override.aes = list(alpha = 1, size = 2)
    )
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
多变项分析:15)作为首标的借款金额均低于10万元,且利率普遍低于非首标的情况
# residual_principle / 借款金额 against 借款利率, coloured by 是否首标, with a
# smoother that inherits the colour mapping. The y-axis is restricted to
# [0, 1] with ticks every 0.1; rows outside that range are dropped by ggplot2
# with a warning. Points are jittered and drawn at alpha 0.05; legend keys are
# overridden to be opaque and larger.
ggplot(
  data = df.lctrack,
  mapping = aes(
    x = 借款利率,
    y = residual_principle / 借款金额,
    colour = 是否首标
  )
) +
  geom_point(position = "jitter", size = 1, alpha = 0.05) +
  geom_smooth() +
  scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0, 1)) +
  scale_color_brewer(
    type = "div",
    guide = guide_legend(
      title = "是否首标",
      reverse = FALSE,
      override.aes = list(alpha = 1, size = 2)
    )
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 47457 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_smooth).
多变项分析:16)大部分情况下首次标的的逾期金额占比借款金额要低于非首标的情况
# 借款金额 (log10 y-axis) against 借款利率, coloured by age.bucket, with a
# smoother that inherits the colour mapping. Points are jittered and drawn at
# alpha 0.05; legend keys are overridden to be opaque and larger.
ggplot(
  data = df.lctrack,
  mapping = aes(x = 借款利率, y = 借款金额, colour = age.bucket)
) +
  geom_point(alpha = 1 / 20, size = 1, position = "jitter") +
  scale_y_log10() +
  geom_smooth() +
  scale_color_brewer(
    type = "div",
    guide = guide_legend(
      # The colour aesthetic is age.bucket, so the legend is titled
      # "年龄分组" (age group) rather than "认证成功数".
      title = "年龄分组",
      reverse = FALSE,
      override.aes = list(alpha = 1, size = 2)
    )
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# residual_principle / 借款金额 against 借款利率, coloured by age.bucket, with a
# smoother that inherits the colour mapping. The y-axis is restricted to
# [0, 1] with ticks every 0.1; rows outside that range are dropped by ggplot2
# with a warning. Points are jittered and drawn at alpha 0.05; legend keys are
# overridden to be opaque and larger.
ggplot(
  data = df.lctrack,
  mapping = aes(
    x = 借款利率,
    y = residual_principle / 借款金额,
    colour = age.bucket
  )
) +
  geom_point(alpha = 1 / 20, size = 1, position = "jitter") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
  geom_smooth() +
  scale_color_brewer(
    type = "div",
    guide = guide_legend(
      # The colour aesthetic is age.bucket, so the legend is titled
      # "年龄分组" (age group) rather than "认证成功数".
      title = "年龄分组",
      reverse = FALSE,
      override.aes = list(alpha = 1, size = 2)
    )
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 47487 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_smooth).
# residual_principle / 借款金额 against 借款利率, coloured by 是否首标, with the
# x-axis restricted to rates 10-25 (ticks every 1) and the y-axis to [0, 1]
# (ticks every 0.1). Rows outside either range are dropped by ggplot2 with a
# warning. Points are jittered and drawn at alpha 0.05; legend keys are
# overridden to be opaque and larger.
ggplot(
  data = df.lctrack,
  mapping = aes(
    x = 借款利率,
    y = residual_principle / 借款金额,
    colour = 是否首标
  )
) +
  geom_point(alpha = 1 / 20, size = 1, position = "jitter") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
  scale_x_continuous(limits = c(10, 25), breaks = seq(10, 25, 1)) +
  geom_smooth() +
  scale_color_brewer(
    palette = "Set1",
    guide = guide_legend(
      # The colour aesthetic is 是否首标, so the legend carries that name
      # rather than "认证成功数".
      title = "是否首标",
      reverse = FALSE,
      override.aes = list(alpha = 1, size = 2)
    )
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 47526 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_smooth).
# Spearman rank correlation between 年龄 and 借款金额.
# Both columns contain ties, so an exact p-value cannot be computed; with the
# default `exact = NULL`, cor.test() only warns and falls back to the
# asymptotic approximation. Requesting `exact = FALSE` up front yields the
# same statistic and p-value without the warning.
cor.test(df.lc$年龄, df.lc$借款金额, method = "spearman", exact = FALSE)
## Warning in cor.test.default(df.lc$年龄, df.lc$借款金额, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: df.lc$年龄 and df.lc$借款金额
## S = 4.3088e+15, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2710632
与借款金额相关的因子:1.历史成功借款金额 2.是否首标 3.认证成功数 4.贷款类别 5.年龄层次 6.利率
与逾期金额/借款金额相关的因子:1.日期 2.认证成功数 3.历史成功借款金额 4.初始评级 5.是否首标 6.年龄层次 7.利率
多变项分析:1)初始评级与借款金额基本没有特定关系 多变项分析:2)在2016年1月前,各初始评级的逾期金额占比分布相对正常,之后初始评级失效 多变项分析:3)在2015-01到2017-01发行的标的,首标的平均借款金额逐渐超过非首标的平均借款金额 多变项分析:4)首标的逾期金额占比始终高于非首标的逾期金额占比 多变项分析:5)在2016-01之后,未进行认证的顾客显著增加 多变项分析:6)在2016-01之后,各个认证成功数的标的均出现逾期金额占比高企的情况 多变项分析:7)在历史借款金额越高,借款金额越高的基础上,B级的跨度区间最大,C级和D级标的金额和历史借款金额高度集中 多变项分析:8)在2016-01之前,逾期金额占比:其他>普通>电商,2016-01之后变化剧烈 多变项分析:9)在历史借款金额越高,借款金额越高的基础上,B级的跨度区间最大,C级和D级标的金额和历史借款金额高度集中 多变项分析:10)A-E初始评级的逾期率均是先上升后下降 多变项分析:11)APP闪电贷和普通贷款分布最多,且金额集中 多变项分析:12)APP闪电、普通和其他贷款逾期金额占比,随着历史成功借款金额增多而增多;而电商贷款是随着历史成功借款金额的增多而减少 多变项分析:13)(16,26]和(26,36]两个年龄层次是2015-01至2017-01时间范围内标的的主体,其中16-26为大学生群体 多变项分析:14)各年龄层次在2015-01至2017-01时间范围内,均随着历史成功借款金额增大,逾期率下降;历史借款金额低于1000元情况下,(16,26]年龄组逾期率最低 多变项分析:15)作为首标的借款金额均低于10万元,且利率普遍低于非首标的情况 多变项分析:16)大部分情况下首次标的的逾期金额占比借款金额要低于非首标的情况
没有创建数据集模型
### 绘图一
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
### 描述一 1.2015-01到2017-01发行成功的标的,各类借款类型均出现快速增长,其中2016年3月开始APP闪电贷快速发展,2016年11月开始电商借款消失 2.2015-01到2017-01发行成功的标的,各类初始评级贷款快速增长,其中C级和D级始终占据主体 3.2016-04之后,非首标贷款标的明显高于首标标的 4.2016-02以及之后发行成功的标的,逾期金额增长迅速
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
1)对比2016年2月之前成功发行的标的与之后发行的标的;历史成功借款金额继续与借款金额呈现正相关性 2)各初始评级分布基本前后一致,说明初始评级的标准未发生变化,且此标准与历史成功借款金额基本无关
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 18357 rows containing non-finite values (stat_smooth).
## Warning: Removed 18385 rows containing missing values (geom_point).
## Warning: Removed 36 rows containing missing values (geom_smooth).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 68961 rows containing non-finite values (stat_smooth).
## Warning: Removed 29306 rows containing missing values (geom_point).
1)对比2016年2月之前成功发行的标的与之后发行的标的:逾期金额占借款金额的比例随历史成功借款金额的增加,仍然呈现先上升后下降的关系 2)根据数据推论,初始评级/信用评级对2016年2月开始发行的标的出现失效的状态,这可能与行业环境以及特殊借款群体有关(例如大学生毕业后换了手机号,不再偿还贷款等情况)
1)该数据仅仅是拍拍贷的样本数据,并不能代表整体情况 2)对于标的的评级,没有给出该信用评级的依据说明 3)缺乏个人相关数据,无法直接获得个体正常还款和逾期的数据 4)针对还款行为,一般要区分还款意愿和还款能力,这两点都没有数据支撑